Student Name: Zun Wang

Student ID: 915109847

Insert answers to R SNP exercises 1 - 4 here. Submit .Rmd and .html by git.

library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
vcf.data <- read_tsv("output/SNP_analysis/IMB211_R500.vcf",na = c("","NA","."),comment="#",col_names = FALSE)
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_double(),
##   X3 = col_logical(),
##   X4 = col_character(),
##   X5 = col_character(),
##   X6 = col_double(),
##   X7 = col_logical(),
##   X8 = col_character(),
##   X9 = col_character(),
##   X10 = col_character(),
##   X11 = col_character()
## )
vcf.header <- system("grep '#C' output/SNP_analysis/IMB211_R500.vcf",intern = TRUE) #might not work on Windows
vcf.header
## [1] "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tIMB211\tR500"
vcf.header <- vcf.header %>% str_replace("#","") #get rid of the pound sign

vcf.header <- vcf.header %>% str_split(pattern = "\t")
colnames(vcf.data) <- vcf.header[[1]] #we need the [[1]] because str_split returns a list and we want the first element
head(vcf.data)
vcf.data <- vcf.data %>%
  filter(str_detect(INFO, "TYPE=snp"))


vcf.data <- vcf.data %>% separate(IMB211,
                                  into = paste("IMB211",c("gt","tot.depth","allele.depth", "ref.depth","ref.qual","alt.depth","alt.qual","gt.lik"),sep="_"), # new column names
                                  sep=":", #separate on ":" 
                                  convert=TRUE #converts numeric columns to numeric
)
vcf.data <- vcf.data %>% separate(R500,
                                  into = paste("R500",c("gt","tot.depth","allele.depth","ref.depth","ref.qual","alt.depth","alt.qual","gt.lik"),sep="_"), # new column names
                                  sep=":", #separate on ":" 
                                  convert=TRUE #converts numeric columns to numeric
)

Exercise 1
To explore the quality of our data, make a histogram of genotype quality. It is hard to get a feel for the distribution of QUAL scores at the low end of the scale (less than 500) using the default settings, so try making a second histogram that illustrates this region better. (Hint: one option is to subset the data)

pl <- ggplot(data=vcf.data, aes(x=QUAL))
pl <- pl + geom_histogram()
pl
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

vcf.data2 <- filter(vcf.data, QUAL <= 500)
pl2 <- ggplot(data = vcf.data2,aes(x=QUAL))
pl2 <- pl2 + geom_histogram()
pl2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise 2
We only want to keep positions that have a reasonable probabilty of being correct.

a At a quality of 40 what is the probability that the SNP call is wrong?

# P = 10 ^ (-4) = 0.0001

b Subset the data to only keep positions where the quality score is 40 or greater. Put the retained SNPs in an object called vcf.data.good

vcf.data.good <- filter(vcf.data,QUAL >= 40)

c What percentage of SNPs were retained?

nrow(vcf.data.good) / nrow(vcf.data) * 100
## [1] 77.88913
vcf.data.good %>% 
  filter(IMB211_gt != R500_gt, 
         IMB211_tot.depth > 20, 
         R500_tot.depth > 20) %>%
  select(CHROM, POS, REF, ALT, IMB211_gt, R500_gt)

Exercise 3: What do the “0/0”, “0/1”, and “1/1” values indicate? Use IGV to look at a few of the positions above (see lab page) and then explain what “0/0”, “0/1”, and “1/1” values indicate.

They are the genotype expression. 0/0 is homozygous reference, 1/1 is homozygous alternate, and 0/1 is heterozygous. Specifically, 0 means in this position, the base in the sample is same as reference; while 1 means the same as alternative.

table(vcf.data.good$IMB211_gt)
## 
##   0/0   0/1   0/2   1/1   1/2   1/3   2/2 
##  8587  1127     3 26973    33     1   110
table(vcf.data.good$R500_gt)
## 
##   0/0   0/1   0/2   1/1   1/2   2/2   3/3 
##  7045  1031     6 29202    40    91     1
vcf.data.good %>% select(IMB211_gt, R500_gt) %>% ftable
##           R500_gt   0/0   0/1   0/2   1/1   1/2   2/2   3/3
## IMB211_gt                                                  
## 0/0                   1   318     0  8263     5     0     0
## 0/1                 305   418     1   299     2     1     0
## 0/2                   0     1     0     1     1     0     0
## 1/1                6737   240     5 17220    19    89     0
## 1/2                   2     5     0    11    10     1     1
## 1/3                   0     0     0     0     1     0     0
## 2/2                   0     2     0   107     1     0     0

Exercise 4
a (From the table generated in the lab), which SNPS would be most useful for a downstream QTL analysis of F2 progeny generated from a cross of IMB211 and R500? (Ignore the allele categories that have “2”, “3”, or “4”). Hint: you want SNPs that will unambiguously distinguish a locus as coming from IMB211 or R500.

The cross between 1/1 and 0/0, while the order of 1/1 and 0/0 assigned to the group does not matter.

b Subset the vcf.data.good data frame so that you only have these SNPs. Place the results in vcf.data.good.F2

vcf.data.good.F2 <- filter(vcf.data.good, (IMB211_gt == "1/1" & R500_gt == "0/0")|(IMB211_gt == "0/0" & R500_gt == "1/1") )
vcf.data.good.F2

c How many SNPS are retained?

15000

Exercise 5
a Using the high quality F2 SNP list from Exercise 4 (vcf.data.good.F2), for each SNP plot its position on the chromosome (x axis), and total read depth (R500 and IMB211 combined) (y axis).

vcf.data.good.F2["totdepth"] <- vcf.data.good.F2$R500_tot.depth+vcf.data.good.F2$IMB211_tot.depth
pl3 <- ggplot(vcf.data.good.F2,aes(x = POS,y = totdepth))+geom_point()
pl3

Optional: color each SNP based on the percentage of reads that are R500. (optional part not graded).

vcf.data.good.F2["R500perc"] <- vcf.data.good.F2$R500_tot.depth/vcf.data.good.F2$totdepth
pl4 <- ggplot(vcf.data.good.F2,aes(x = POS,y = totdepth,color = R500perc))+geom_point()
pl4

b Use the help function to learn about xlim(). Use this function to plot only the region betweeen 20,000,000 and 25,000,000 bp. Why might there be gaps with no SNPs?

pl5 <- pl4 + xlim(20000000,25000000)
pl5
## Warning: Removed 12009 rows containing missing values (geom_point).

For Fun (??)–not graded–
Plot the number of each type of base change (A->G, etc). Are there differences? Is this expected?

baseChange <- data.frame(matrix(nrow = 12,ncol = 0))
baseChange["base_change"] <- c("AtoT","TtoA","AtoC","CtoA","AtoG","GtoA","TtoC","CtoT","TtoG","GtoT","CtoG","GtoC")

baseChange.ac <- select(vcf.data.good.F2,REF,ALT)
baseChange.ac$REF <- as.character(baseChange.ac$REF)
baseChange.ac$ALT <- as.character(baseChange.ac$ALT)
df_a <- baseChange.ac %>%
  mutate(REF_new=strsplit(REF, "")) %>% 
  unnest(REF_new)
baseChange.ac<- baseChange.ac %>%
  mutate(ALT=strsplit(ALT, "")) %>% 
  unnest(ALT)
baseChange.ac$REF <- df_a$REF_new
baseChange.ac["ALTnum"] <- 1
baseChange.ac["REFnum"] <- 1

baseChange.ac$REFnum[which(baseChange.ac$REF=="T")] <- 2
baseChange.ac$REFnum[which(baseChange.ac$REF=="C")] <- 4
baseChange.ac$REFnum[which(baseChange.ac$REF=="G")] <- 8
baseChange.ac$ALTnum[which(baseChange.ac$ALT=="T")] <- 2
baseChange.ac$ALTnum[which(baseChange.ac$ALT=="C")] <- 4
baseChange.ac$ALTnum[which(baseChange.ac$ALT=="G")] <- 8
baseChange.ac["diff"]<-baseChange.ac$REFnum - baseChange.ac$ALTnum
baseChange.ac
baseChange["count"]<-c(nrow(subset(baseChange.ac,diff == -1)),nrow(subset(baseChange.ac,diff == 1)),nrow(subset(baseChange.ac,diff == -3)),nrow(subset(baseChange.ac,diff == 3)),nrow(subset(baseChange.ac,diff == -7)),nrow(subset(baseChange.ac,diff == 7)),nrow(subset(baseChange.ac,diff == -2)),nrow(subset(baseChange.ac,diff == 2)),nrow(subset(baseChange.ac,diff == -6)),nrow(subset(baseChange.ac,diff == 6)),nrow(subset(baseChange.ac,diff == -4)),nrow(subset(baseChange.ac,diff == 4)))
baseChange
pl6 <- ggplot(baseChange,aes(x = base_change,y = count,fill = base_change))+geom_bar(stat = "identity")
pl6